2. Compare two arirports
summary(aov(log_dura ~ droplocation, data))
## Df Sum Sq Mean Sq F value Pr(>F)
## droplocation 1 409.4 409.4 5887 <2e-16 ***
## Residuals 6858 477.0 0.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
3. split data
JFK = data %>% filter(droplocation == "JFK")
LAG = data %>% filter(droplocation == "LAG")
4. split time window
model_wh_JFK = JFK %>%
mutate(time = ifelse(hour(pick) %in% c(0:6, 23), 1, ifelse(hour(pick) %in% c(7:10, 16:19), 2, 3)),
weekday = ifelse(weekdays(pick) %in% c("Saturday", "Sunday"), 1,
ifelse(weekdays(pick) %in% c("Monday","Tuesday","Wednesday"), 2,3)))
model_wh_JFK$time = factor(model_wh_JFK$time)
model_wh_JFK$weekday = factor(model_wh_JFK$weekday)
model_wh_LAG = LAG %>%
mutate(time = ifelse(hour(pick) %in% c(0:6, 23), 1, ifelse(hour(pick) %in% c(7:10, 16:19), 2, 3)),
weekday = ifelse(weekdays(pick) %in% c("Saturday", "Sunday"), 1,
ifelse(weekdays(pick) %in% c("Monday","Tuesday","Wednesday"), 2, 3)))
model_wh_LAG$time = factor(model_wh_LAG$time)
model_wh_LAG$weekday = factor(model_wh_LAG$weekday)
5. interaction plot
interaction.plot(model_wh_JFK$weekday,model_wh_JFK$time,model_wh_JFK$log_dura,
main = "Interaction Plot for JFK",
xlab = "Weekday", ylab = "log of duration", trace.label = "Time Zone")

interaction.plot(model_wh_LAG$weekday,model_wh_LAG$time,model_wh_LAG$log_dura,
main = "Interaction Plot for LAG",
xlab = "Weekday", ylab = "log of duration", trace.label = "Time Zone")

6. type III two way ANOVA
library(car)
lm_wh_JFK <- lm(log_dura ~ time * weekday, contrasts = list(time=contr.sum, weekday=contr.sum), data = model_wh_JFK)
Anova(lm_wh_JFK, type=3)
## Anova Table (Type III tests)
##
## Response: log_dura
## Sum Sq Df F value Pr(>F)
## (Intercept) 43280 1 917596.233 < 2.2e-16 ***
## time 52 2 549.239 < 2.2e-16 ***
## weekday 9 2 98.712 < 2.2e-16 ***
## time:weekday 4 4 20.781 < 2.2e-16 ***
## Residuals 148 3134
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lm_wh_LAG <- lm(log_dura ~ time * weekday, contrasts = list(time=contr.sum, weekday=contr.sum), data = model_wh_LAG)
Anova(lm_wh_LAG, type=3)
## Anova Table (Type III tests)
##
## Response: log_dura
## Sum Sq Df F value Pr(>F)
## (Intercept) 38056 1 795529.73 < 2.2e-16 ***
## time 57 2 593.92 < 2.2e-16 ***
## weekday 14 2 144.82 < 2.2e-16 ***
## time:weekday 3 4 18.16 8.77e-15 ***
## Residuals 177 3708
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
7. multiple comparison for interaction
model_wh_JFK$comb = paste(as.character(model_wh_JFK$time),as.character(model_wh_JFK$weekday)) %>% gsub(" ", "", .) %>% factor()
anova_JFK <- aov(log_dura ~ time + weekday + comb, model_wh_JFK)
agricolae::HSD.test(anova_JFK, "comb", alpha = 0.05)$group
## log_dura groups
## 23 4.032461 a
## 33 3.995427 ab
## 22 3.969308 b
## 32 3.963610 b
## 31 3.904146 c
## 21 3.780664 d
## 13 3.692944 e
## 12 3.632236 f
## 11 3.607276 f
model_wh_LAG$comb = paste(as.character(model_wh_LAG$time),as.character(model_wh_LAG$weekday)) %>% gsub(" ", "", .) %>% factor()
anova_LAG <- aov(log_dura ~ time + weekday + comb, model_wh_LAG)
agricolae::HSD.test(anova_LAG, "comb", alpha = 0.05)$group
## log_dura groups
## 33 3.561377 a
## 23 3.544810 a
## 32 3.471657 b
## 22 3.463655 b
## 31 3.393732 c
## 21 3.283494 d
## 13 3.206756 e
## 12 3.176610 ef
## 11 3.132125 f
8. linear regression
comb_data <- rbind(model_wh_JFK,model_wh_LAG)
comb_data$drop_num <- factor(ifelse(comb_data$droplocation == "JFK", "1", "0"))
fit <- lm(log_dura ~ drop_num + time * weekday, comb_data)
summary(fit)
##
## Call:
## lm(formula = log_dura ~ drop_num + time * weekday, data = comb_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.63695 -0.15438 -0.02263 0.13393 1.09774
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.127933 0.010647 293.775 < 2e-16 ***
## drop_num1 0.484142 0.005322 90.968 < 2e-16 ***
## time2 0.162373 0.012664 12.822 < 2e-16 ***
## time3 0.280144 0.013575 20.636 < 2e-16 ***
## weekday2 0.037893 0.012472 3.038 0.00239 **
## weekday3 0.079711 0.013598 5.862 4.78e-09 ***
## time2:weekday2 0.144597 0.015857 9.119 < 2e-16 ***
## time3:weekday2 0.029698 0.016990 1.748 0.08051 .
## time2:weekday3 0.176176 0.017416 10.116 < 2e-16 ***
## time3:weekday3 0.049437 0.018414 2.685 0.00728 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2182 on 6850 degrees of freedom
## Multiple R-squared: 0.6321, Adjusted R-squared: 0.6317
## F-statistic: 1308 on 9 and 6850 DF, p-value: < 2.2e-16
9. plot
# histogram
p1 = origin_data %>% mutate(log.dur = log(duration)) %>%
ggplot(.,aes(x = log.dur))+
geom_histogram(color="black", fill="steelblue",binwidth = 0.1)+
scale_x_continuous(name="Log of duration")+
theme_bw(14)
# histogram
p2 = origin_data %>%
ggplot(.,aes(x = duration))+
geom_histogram(color="black", fill="steelblue",binwidth = 5)+
scale_x_continuous(name="Duration")+
theme_bw(14)
ggarrange(p2,p1,nrow = 2, ncol = 1)

# boxplot
origin_data %>%
plot_ly(., x = ~duration, type = "box") %>%
layout(title = "Boxplot of Durations", xaxis = list(title = "Durations"))
# smooth line
p3 = data %>% group_by(Rain) %>% summarise(duration.avg = mean(log_dura)) %>%
ggplot()+
geom_smooth(aes(Rain, duration.avg), method = "loess")+
xlab("Rain Levels")+
ylab("Log of Duration (Min)")+
ylim(3,4)+
ggtitle("Average Duration VS Rain Effect")+
theme_bw(14)
# smooth line
p4 = data %>%
ggplot()+
geom_smooth(aes(TMAX, log_dura), method = "loess", col = "red", se = F)+
xlab("Temperature")+
ylab("Log of Duration (Min)")+
ylim(3.25,3.75)+
ggtitle("Average Duration VS Temperature Effect")+
theme_bw(14)
grid.arrange(p3,p4)

10. Prediction based on our model
new = data.frame(drop_num = "1", time = "1", weekday = "1")
y.pred = exp(predict(fit, newdata = new, type = "response", interval = "predict"))
y.pred
## fit lwr upr
## 1 37.04285 24.13981 56.84273